########################################################
# Project:    Talking to the Populist Radical Right
# Task:       The script produces the raw correspondence
#             analysis scores for the Danish parties
# Author:     Jan Schwalbach (21/07/2022)
########################################################

# Loading packages and data
library(quanteda.textmodels)
library(quanteda.textplots)

library(quanteda)
library(RCurl)
library(XML)
library(stringr)
library(lubridate)
library(caret)
library(e1071)
library(plyr)
library(ggplot2)
library(dplyr)
library(readtext)
library(data.table)
library(wordshoal)
library(DT)
library(xtable)
library(zoo)
library(tm)
library(ggrepel)
library(grid)
library(ngram)

# Read the prepared corpus file; the statements below expect it to provide
# the object `corpusDK`.
load(file = "Corpus_Denmark.Rdata")

# Keep sessions dated 1998-03-11 or later.
corpusDK <- corpusDK[corpusDK$sessiondate >= "1998-03-11", ]

# Convert CH to numeric, then keep rows whose CH is below 11 and not 0.
corpusDK$CH <- as.numeric(corpusDK$CH)
corpusDK <- corpusDK[corpusDK$CH < 11 & corpusDK$CH != 0, ]

### Subsetting the data set

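# To get the scores for each type of debate, run exactly one of the three
# options below. Options 2 and 3 both overwrite `corpusDK`, so executing the
# whole file top to bottom applies the immigration AND the education filter.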

# 1. All speeches: No changes needed

# 2. Immigration subsetting
#
# A speech qualifies when migrationcount is above 2 or migrationcount1 is at
# least 1. The two selections are stacked, duplicate rows are removed, and
# finally only rows with a positive migrationcount are kept.
# NOTE(review): this overwrites corpusDK; if the education block is executed
# afterwards it filters this already reduced corpus.

corpusDK1 <- corpusDK[corpusDK$migrationcount > 2, ]
corpusDK2 <- corpusDK[corpusDK$migrationcount1 >= 1, ]
corpusDK <- unique(rbind(corpusDK1, corpusDK2))
corpusDK <- corpusDK[corpusDK$migrationcount > 0, ]

# 3. Education subsetting
#
# A speech qualifies when educationcount is above 2 or educationcount1 is at
# least 1. The two selections are stacked, duplicate rows are removed, and
# finally only rows with a positive educationcount are kept.
# NOTE(review): this filters whatever corpusDK currently holds; if the
# immigration block was executed before, only speeches passing both remain.

corpusDK1 <- corpusDK[corpusDK$educationcount > 2, ]
corpusDK2 <- corpusDK[corpusDK$educationcount1 >= 1, ]
corpusDK <- unique(rbind(corpusDK1, corpusDK2))
corpusDK <- corpusDK[corpusDK$educationcount > 0, ]

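# All other speeches (optional): uncomment the lines below to keep only the
# speeches with low eucount, migrationcount and educationcount values.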

#corpusDK<-corpusDK[corpusDK$eucount<=2,]
#corpusDK<-corpusDK[corpusDK$eucount1<1,]
#corpusDK<-corpusDK[corpusDK$migrationcount<=2,]
#corpusDK<-corpusDK[corpusDK$migrationcount1<1,]
#corpusDK<-corpusDK[corpusDK$educationcount<=2,]
#corpusDK<-corpusDK[corpusDK$educationcount1<1,]

# Deleting all speeches of parties that are not considered in the analysis.
# The party code "-" is excluded as well.
# NOTE(review): the original comment also mentions "the speaker"; presumably
# "-" marks speeches by the chair -- confirm against the corpus codebook.

excluded_parties <- c(
  "-", "SP", "LA", "T", "ALT", "UFG", "KD", "TF", "SIU", "IA", "LH",
  "FRI", "KRF", "UP", "CD", "FP", "NY", "FF", "REP", "SIUMUT", "JF", "NQ"
)

# `%in%` never yields NA. Rows with a missing party are dropped explicitly:
# with a plain data.frame, the chained `!=` filters used before would have
# turned such rows into all-NA rows instead of removing them.
keep_party <- !is.na(corpusDK$party) & !(corpusDK$party %in% excluded_parties)
corpusDK <- corpusDK[keep_party, ]

# Rearranging date variables
# Builds the label/key columns used further down. paste() joins its arguments
# with a single space by default.

# Party label combined with the `year` column.
corpusDK$partyyear <- paste(corpusDK$party,corpusDK$year)
# Session date converted to a number.
corpusDK$date1 <- as.numeric(corpusDK$sessiondate)
# Year-quarter of the session (as.yearqtr).
corpusDK$quarter <- as.yearqtr(corpusDK$sessiondate)
# Party label combined with the quarter; used later as part of the grouping key.
corpusDK$partyquarter <- paste(corpusDK$party,corpusDK$quarter)
# Party label combined with the numeric session date.
corpusDK$partydate <- paste(corpusDK$party,corpusDK$date1)
# Debate identifier: `date` column plus `debate` column.
# NOTE(review): `$` on a data.frame matches column names partially; if the
# corpus has no column named exactly `date`, this may resolve to `date1`
# created above -- confirm that `date` exists in the loaded data.
corpusDK$debateunique <- paste(corpusDK$date,corpusDK$debate)
# Debate identifier combined with the raw speaker name.
corpusDK$speakerdebateunique <- paste(corpusDK$debateunique,corpusDK$speaker_raw)

# Deleting unnecessary variables
# Assigning NULL removes a column; names that are not present are ignored.

obsolete_columns <- c(
  "rn", "raw_text.x", "speechnumber", "length", "chair",
  "eupolitycount", "eupoliticscount", "eupolicycount", "eucount", "eupres",
  "eupolitycount1", "eupoliticscount1", "eupolicycount1", "eucount1",
  "eupres1", "month1"
)

for (column_name in obsolete_columns) {
  corpusDK[[column_name]] <- NULL
}

# Aggregating all speeches by a party on the debate level
library(plyr)

# Punctuation is stripped from the debate identifier before it becomes part of
# the grouping key, so the identifier cannot contain a comma, which is the
# separator used when collapsing below.
corpusDK$debateunique <- removePunctuation(corpusDK$debateunique)
corpusDK$debateparty <- paste(corpusDK$partyquarter, corpusDK$debateunique)

# One row per debateparty key; every listed field is comma-joined across the
# speeches that share the key.
corpusnew <- ddply(
  corpusDK, .(debateparty), summarize,
  sessiondate = paste(sessiondate, collapse = ","),
  CH = paste(CH, collapse = ","),
  partyquarter = paste(partyquarter, collapse = ","),
  debateunique = paste(debateunique, collapse = ","),
  raw_text2 = paste(raw_text2, collapse = ","),
  type = paste(type, collapse = ","),
  party = paste(party, collapse = ",")
)

# For the metadata fields only the part before the first comma is kept, i.e.
# the value of the first speech in each group. raw_text2 stays fully joined.
first_value_columns <- c(
  "party", "CH", "type", "sessiondate", "debateunique", "partyquarter"
)
for (meta_column in first_value_columns) {
  corpusnew[[meta_column]] <- gsub(
    "\\,.*", "", corpusnew[[meta_column]],
    fixed = FALSE
  )
}
# Remove runs of capital letters followed by a space from partyquarter (the
# column is rebuilt from party and quarter at the end of this block).
corpusnew$partyquarter <- gsub(
  "[A-Z]* ", "", corpusnew$partyquarter,
  fixed = FALSE
)

# Continue with the aggregated table; `text` holds the joined speeches.
corpusDK <- as.data.frame(corpusnew)
corpusDK$text <- corpusDK$raw_text2

# NOTE(review): the aggregated table has no `year` column (only the fields
# created in the ddply() call plus `text`), so corpusDK$year is NULL here and
# partyyear carries no year information.
corpusDK$partyyear <- paste(corpusDK$party, corpusDK$year)

# Restore the date type and rebuild the quarter-based labels.
corpusDK$sessiondate <- as.Date(corpusDK$sessiondate)
corpusDK$quarter <- as.yearqtr(corpusDK$sessiondate)
corpusDK$partyquarter <- paste(corpusDK$party, corpusDK$quarter)

# Keeping only debates with more than two rows (party-level documents) and
# documents with at least 50 words.
# NOTE(review): the previous comment described this as "more than one party"
# and "more than 50 words"; the code requires n() > 2 and length >= 50.
# Confirm which thresholds are intended.

corpusDK <- as.data.frame(
  filter(group_by(corpusDK, debateunique), n() > 2)
)

# Word count per aggregated document (runs of word characters).
corpusDK$length <- str_count(corpusDK$text, "\\w+")
corpusDK <- corpusDK[corpusDK$length >= 50, ]

# Dividing the corpus into legislative periods
# CH was turned into text during the aggregation, so convert it back first.
corpusDK$CH <- as.numeric(corpusDK$CH)

# CH 1-4, 5-7 and 8-10 form the three periods.
corpusDK1998 <- corpusDK[0 < corpusDK$CH & corpusDK$CH < 5, ]
corpusDK2001 <- corpusDK[4 < corpusDK$CH & corpusDK$CH < 8, ]
corpusDK2005 <- corpusDK[7 < corpusDK$CH & corpusDK$CH < 11, ]

### To get the scores for all/government/opposition debates,
### select one of the three options for the respective legislative debate

# Rows of `period_data` whose `type` equals `debate_type`.
rows_of_type <- function(period_data, debate_type) {
  period_data[period_data$type == debate_type, ]
}

corpusDK1998g <- rows_of_type(corpusDK1998, "government")
corpusDK1998o <- rows_of_type(corpusDK1998, "opposition")

corpusDK2001g <- rows_of_type(corpusDK2001, "government")
corpusDK2001o <- rows_of_type(corpusDK2001, "opposition")

corpusDK2005g <- rows_of_type(corpusDK2005, "government")
corpusDK2005o <- rows_of_type(corpusDK2005, "opposition")

# Choose the period and debate type to be scaled by setting the two selectors
# below. Only the selected data set is turned into a quanteda corpus; before,
# nine corpora were built in sequence and each overwrote the previous one.
# The defaults reproduce the result of running every line top to bottom
# (2005 period, opposition debates).
selected_period <- "2005"      # one of "1998", "2001", "2005"
selected_type <- "opposition"  # one of "all", "government", "opposition"

corpus_options <- list(
  "1998" = list(
    all = corpusDK1998,
    government = corpusDK1998g,
    opposition = corpusDK1998o
  ),
  "2001" = list(
    all = corpusDK2001,
    government = corpusDK2001g,
    opposition = corpusDK2001o
  ),
  "2005" = list(
    all = corpusDK2005,
    government = corpusDK2005g,
    opposition = corpusDK2005o
  )
)

# Fail early on a mistyped selector instead of passing NULL to corpus().
stopifnot(
  selected_period %in% names(corpus_options),
  selected_type %in% names(corpus_options[[selected_period]])
)

corpusDKcorpus <- corpus(corpus_options[[selected_period]][[selected_type]])

# Creating a dfm (removing punctuation, number, and stopwords)

# Tokenise the selected corpus without punctuation, symbols and numbers.
Denmark_token <- tokens(
  corpusDKcorpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE
)

# Drop Danish stopwords.
Denmark_token <- tokens_remove(Denmark_token, pattern = stopwords("danish"))

# Document-feature matrix with Danish stemming applied to the features.
dfm_Denmark <- dfm_wordstem(dfm(Denmark_token), language = "danish")
#dfm_Denmark <- dfm_trim(dfm_Denmark, min_docfreq = 1)

# Pool all documents of a party into one row.
dfm_Denmark <- dfm_group(dfm_Denmark, groups = party)

# Correspondence analysis on the party-level matrix.
tmod_ca <- textmodel_ca(dfm_Denmark)

# Document coordinates on the first two dimensions.
doc_scores <- lapply(
  1:2,
  function(k) coef(tmod_ca, doc_dim = k)$coef_document
)
dat_ca <- data.frame(dim1 = doc_scores[[1]], dim2 = doc_scores[[2]])

textplot_scale1d(tmod_ca)

# Score table with a party label; anything from " <four digits>" onwards is
# removed from the model's row names.
allvalues <- dat_ca
allvalues$party <- gsub(
  " [0-9][0-9][0-9][0-9].*", "", tmod_ca$rownames,
  fixed = FALSE
)

# Plotting the respective values for the two most important dimensions
# Each party is a red point with a repelled white-on-colour label; the legend
# is hidden.
# NOTE(review): seven fill colours are supplied -- confirm that no more than
# seven parties remain, otherwise the manual scale runs out of values.

p <- ggplot(allvalues, aes(x = dim1, y = dim2)) +
  scale_fill_manual(
    values = c(
      "#E7B800", "violetred4", "aquamarine4", "deeppink",
      "red", "red4", "dodgerblue4"
    )
  ) +
  geom_point(color = "red") +
  theme_classic(base_size = 20) +
  geom_label_repel(
    aes(label = party, fill = party),
    color = "white",
    size = 5
  ) +
  theme(legend.position = "none") +
  labs(title = "Debates Denmark", y = "Dimension 2", x = "Dimension 1")
p

### Creating data sets for the analysis for:

# all debates
# Three copies of the current score table are made, with the two dimension
# columns renamed for all / government / opposition debates, then bound
# side by side.
# NOTE(review): all three copies are taken from the same `allvalues`; they only
# differ if `allvalues` is recomputed between these assignments (presumably
# this section is meant to be run step by step) -- confirm the workflow.

CA_DK_all <- allvalues
names(CA_DK_all)[1:2] <- c("CA_DK_all_D1", "CA_DK_all_D2")

CA_DK_all_gov <- allvalues
names(CA_DK_all_gov)[1:2] <- c("CA_DK_all_gov_D1", "CA_DK_all_gov_D2")

CA_DK_all_op <- allvalues
names(CA_DK_all_op)[1:2] <- c("CA_DK_all_op_D1", "CA_DK_all_op_D2")

Positions_CA_DK_ALL <- cbind(CA_DK_all, CA_DK_all_gov, CA_DK_all_op)

# Save the positions table under each of the three file names.
for (out_file in c("CA_DK_ALL.Rdata", "CA_DK_ALL_01.Rdata", "CA_DK_ALL_05.Rdata")) {
  save(Positions_CA_DK_ALL, file = out_file)
}

# Immigration debates
# Three copies of the current score table are made, with the two dimension
# columns renamed for all / government / opposition debates, then bound
# side by side.
# NOTE(review): all three copies are taken from the same `allvalues`; they only
# differ if `allvalues` is recomputed between these assignments (presumably
# this section is meant to be run step by step) -- confirm the workflow.

CA_DK_MIG <- allvalues
names(CA_DK_MIG)[1:2] <- c("CA_DK_MIG_D1", "CA_DK_MIG_D2")

CA_DK_MIG_gov <- allvalues
names(CA_DK_MIG_gov)[1:2] <- c("CA_DK_MIG_gov_D1", "CA_DK_MIG_gov_D2")

CA_DK_MIG_op <- allvalues
names(CA_DK_MIG_op)[1:2] <- c("CA_DK_MIG_op_D1", "CA_DK_MIG_op_D2")

Positions_CA_DK_MIG <- cbind(CA_DK_MIG, CA_DK_MIG_gov, CA_DK_MIG_op)

# Save the positions table under each of the three file names.
for (out_file in c("CA_DK_MIG.Rdata", "CA_DK_MIG_01.Rdata", "CA_DK_MIG_05.Rdata")) {
  save(Positions_CA_DK_MIG, file = out_file)
}

# Education debates
# Three copies of the current score table are made, with the two dimension
# columns renamed for all / government / opposition debates, then bound
# side by side.
# NOTE(review): all three copies are taken from the same `allvalues`; they only
# differ if `allvalues` is recomputed between these assignments (presumably
# this section is meant to be run step by step) -- confirm the workflow.

CA_DK_ED <- allvalues
names(CA_DK_ED)[1:2] <- c("CA_DK_ED_D1", "CA_DK_ED_D2")

CA_DK_ED_gov <- allvalues
names(CA_DK_ED_gov)[1:2] <- c("CA_DK_ED_gov_D1", "CA_DK_ED_gov_D2")

CA_DK_ED_op <- allvalues
names(CA_DK_ED_op)[1:2] <- c("CA_DK_ED_op_D1", "CA_DK_ED_op_D2")

Positions_CA_DK_ED <- cbind(CA_DK_ED, CA_DK_ED_gov, CA_DK_ED_op)

# Save the positions table under each of the three file names.
for (out_file in c("CA_DK_ED.Rdata", "CA_DK_ED_01.Rdata", "CA_DK_ED_05.Rdata")) {
  save(Positions_CA_DK_ED, file = out_file)
}
